R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
library(broom)
library(splines)
library(purrr)
### define the fine spatial grids for each GP, the spatial resolution
### will be between -2 and 2 for all dimensions for simplicity
num_fine_int <- 80

fine_grid_list <- list(
  x1 = seq(-2, 2, length.out = num_fine_int+1)
) 
fine_grid_list
## $x1
##  [1] -2.00 -1.95 -1.90 -1.85 -1.80 -1.75 -1.70 -1.65 -1.60 -1.55 -1.50 -1.45
## [13] -1.40 -1.35 -1.30 -1.25 -1.20 -1.15 -1.10 -1.05 -1.00 -0.95 -0.90 -0.85
## [25] -0.80 -0.75 -0.70 -0.65 -0.60 -0.55 -0.50 -0.45 -0.40 -0.35 -0.30 -0.25
## [37] -0.20 -0.15 -0.10 -0.05  0.00  0.05  0.10  0.15  0.20  0.25  0.30  0.35
## [49]  0.40  0.45  0.50  0.55  0.60  0.65  0.70  0.75  0.80  0.85  0.90  0.95
## [61]  1.00  1.05  1.10  1.15  1.20  1.25  1.30  1.35  1.40  1.45  1.50  1.55
## [73]  1.60  1.65  1.70  1.75  1.80  1.85  1.90  1.95  2.00
fine_grid <- fine_grid_list %>% as.data.frame() %>% tibble::as_tibble()
fine_basis_mat <- model.matrix(~ ns(x1, df = 8) -1, data = fine_grid)
colnames(fine_basis_mat) 
## [1] "ns(x1, df = 8)1" "ns(x1, df = 8)2" "ns(x1, df = 8)3" "ns(x1, df = 8)4"
## [5] "ns(x1, df = 8)5" "ns(x1, df = 8)6" "ns(x1, df = 8)7" "ns(x1, df = 8)8"
fine_basis_mat %>% as.data.frame() %>% tbl_df() %>%
  mutate(x1 = fine_grid$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

set.seed(4545)
beta_true <- rnorm(n = ncol(fine_basis_mat), mean = 0, sd = 3)
beta_true
## [1] -4.7034162  1.7847890  3.8856362  1.0085650 -4.8070812  6.3484287 -1.0373638
## [8] -0.4311326
mean_trend_true <- as.numeric(fine_basis_mat %*% as.matrix(beta_true))
mean_trend_true
##  [1]  0.00000000 -0.21806277 -0.43646885 -0.65556154 -0.87568414 -1.09717997
##  [7] -1.32039232 -1.54566450 -1.77333981 -2.00376157 -2.23727307 -2.47260126
## [13] -2.70200764 -2.91613732 -3.10563544 -3.26114712 -3.37331751 -3.43279171
## [19] -3.43021488 -3.35623212 -3.20148858 -2.96042168 -2.64263805 -2.26153660
## [25] -1.83051627 -1.36297597 -0.87231463 -0.37193118  0.12477546  0.60440638
## [31]  1.05356264  1.46098004  1.82393331  2.14183187  2.41408518  2.64010266
## [37]  2.81929376  2.95106792  3.03483457  3.07000317  3.05598313  2.99262224
## [43]  2.88152150  2.72472026  2.52425787  2.28217366  2.00050699  1.68129720
## [49]  1.32658363  0.93840561  0.51880251  0.07268969 -0.38351331 -0.83051092
## [55] -1.24900757 -1.61970769 -1.92331571 -2.14053606 -2.25207317 -2.23863146
## [61] -2.08091538 -1.76856062 -1.32692797 -0.79030951 -0.19299729  0.43071661
## [67]  1.04654014  1.62018121  2.11734777  2.50374776  2.74508909  2.81578424
## [73]  2.72506375  2.49086269  2.13111613  1.66375914  1.10672680  0.47795417
## [79] -0.20462367 -0.92307165 -1.65945470
fine_grid_df <- fine_grid %>% mutate(mean_trend = mean_trend_true)
fine_grid_df
## # A tibble: 81 x 2
##       x1 mean_trend
##    <dbl>      <dbl>
##  1 -2         0    
##  2 -1.95     -0.218
##  3 -1.9      -0.436
##  4 -1.85     -0.656
##  5 -1.8      -0.876
##  6 -1.75     -1.10 
##  7 -1.7      -1.32 
##  8 -1.65     -1.55 
##  9 -1.6      -1.77 
## 10 -1.55     -2.00 
## # … with 71 more rows
fine_grid_df %>%
  ggplot(mapping = aes(x = x1, y = mean_trend)) +
  geom_line(size = 1.15) +
  theme_bw()

### generate the noisy observations
sd_noise_1d <- 0.4 # noise

set.seed(65646)
fine_df_1d <- fine_grid_df %>% 
  mutate(y = rnorm(n = n(), mean = mean_trend, sd = sd_noise_1d))
fine_df_1d
## # A tibble: 81 x 3
##       x1 mean_trend      y
##    <dbl>      <dbl>  <dbl>
##  1 -2         0      0.548
##  2 -1.95     -0.218 -0.674
##  3 -1.9      -0.436 -0.521
##  4 -1.85     -0.656 -0.573
##  5 -1.8      -0.876 -0.698
##  6 -1.75     -1.10  -1.58 
##  7 -1.7      -1.32  -0.987
##  8 -1.65     -1.55  -1.06 
##  9 -1.6      -1.77  -2.49 
## 10 -1.55     -2.00  -1.90 
## # … with 71 more rows
### work with a coarse grid instead of all of the points in the fine grid
num_coarse_int <- 40

coarse_grid_list <- list(
  x1 = seq(-2, 2, length.out = num_coarse_int+1)
)
coarse_grid <- coarse_grid_list %>% as.data.frame() %>% tibble::as_tibble()
coarse_grid
## # A tibble: 41 x 1
##       x1
##    <dbl>
##  1 -2   
##  2 -1.9 
##  3 -1.8 
##  4 -1.7 
##  5 -1.6 
##  6 -1.5 
##  7 -1.4 
##  8 -1.30
##  9 -1.2 
## 10 -1.1 
## # … with 31 more rows
train_df_1d <- fine_df_1d %>% 
  right_join(coarse_grid, by = c("x1"))

train_df_1d
## # A tibble: 41 x 3
##       x1 mean_trend      y
##    <dbl>      <dbl>  <dbl>
##  1 -2         0      0.548
##  2 -1.9      -0.436 -0.521
##  3 -1.8      -0.876 -0.698
##  4 -1.7      -1.32  -0.987
##  5 -1.6      -1.77  -2.49 
##  6 -1.5      -2.24  -2.32 
##  7 -1.4      -2.70  -3.03 
##  8 -1.30     -3.11  -3.45 
##  9 -1.2      -3.37  -3.70 
## 10 -1.1      -3.43  -2.76 
## # … with 31 more rows
train_df_1d %>%
  ggplot(mapping = aes(x = x1, y = mean_trend)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

Using ThinPlateSplines for fitting the above function

ThinPlateSplines_fit_func_No_Reg <- function(df_1, data){
  model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'ts'), data = data)[[1]]$X - 1, data = data)
  metrics <- glance(model) %>% mutate(df_1 = df_1)
  return(metrics)
}
df_grid<-  list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid
## # A tibble: 14 x 1
##       x1
##    <int>
##  1     5
##  2     6
##  3     7
##  4     8
##  5     9
##  6    10
##  7    11
##  8    12
##  9    13
## 10    14
## 11    15
## 12    16
## 13    17
## 14    18
ThinPlateSplines_comp_fit <- map_dfr(df_grid$x1, ThinPlateSplines_fit_func_No_Reg, data = train_df_1d)
ThinPlateSplines_comp_fit
## # A tibble: 14 x 13
##    r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##        <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1     0.605         0.551 1.41       11.1 1.69e- 6     5 -69.7  151.  162. 
##  2     0.755         0.713 1.13       18.0 2.19e- 9     6 -60.0  134.  146. 
##  3     0.907         0.888 0.706      47.5 1.03e-15     7 -40.1   96.1 110. 
##  4     0.938         0.923 0.587      62.1 1.12e-17     8 -31.9   81.8  97.2
##  5     0.970         0.962 0.411     117.  6.92e-22     9 -16.6   53.2  70.4
##  6     0.977         0.970 0.369     131.  1.68e-22    10 -11.5   45.0  63.9
##  7     0.977         0.969 0.374     116.  1.92e-21    11 -11.5   46.9  67.5
##  8     0.977         0.968 0.380     103.  2.06e-20    12 -11.4   48.8  71.1
##  9     0.977         0.967 0.383      93.4 1.67e-19    13 -11.0   50.1  74.1
## 10     0.980         0.969 0.372      92.5 4.47e-19    14  -9.03  48.1  73.8
## 11     0.981         0.971 0.362      91.1 1.39e-18    15  -7.19  46.4  73.8
## 12     0.982         0.970 0.366      83.6 1.10e-17    16  -6.83  47.7  76.8
## 13     0.983         0.970 0.364      79.8 5.44e-17    17  -5.73  47.5  78.3
## 14     0.983         0.970 0.366      74.2 3.66e-16    18  -5.17  48.3  80.9
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## #   df_1 <int>
df_best_ThinPlateSplines_AIC <- ThinPlateSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_ThinPlateSplines_AIC
## # A tibble: 1 x 1
##    df_1
##   <int>
## 1    10
ThinPlateSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_ThinPlateSplines_AIC$df_1, bs = 'ts'), data = train_df_1d)[[1]]
ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 10
##        V1    V2    V3    V4     V5    V6     V7     V8    V9    V10
##     <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
##  1 -1.58   1.65 1.82   1.95 -1.95  2.07  -2.02  -2.14      1 -1.69 
##  2 -1.50   1.63 1.73   1.92 -1.86  2.05  -1.92  -2.11      1 -1.61 
##  3 -1.42   1.61 1.63   1.89 -1.75  2.01  -1.80  -2.06      1 -1.52 
##  4 -1.34   1.59 1.54   1.86 -1.63  1.94  -1.65  -1.95      1 -1.44 
##  5 -1.26   1.56 1.44   1.80 -1.50  1.83  -1.47  -1.78      1 -1.35 
##  6 -1.18   1.53 1.34   1.72 -1.34  1.68  -1.25  -1.56      1 -1.27 
##  7 -1.11   1.49 1.23   1.62 -1.16  1.49  -1.02  -1.33      1 -1.18 
##  8 -1.04   1.45 1.11   1.49 -0.972 1.27  -0.798 -1.12      1 -1.10 
##  9 -0.966  1.40 0.990  1.33 -0.775 1.06  -0.617 -0.978     1 -1.01 
## 10 -0.899  1.34 0.861  1.15 -0.587 0.868 -0.493 -0.905     1 -0.930
## # … with 31 more rows
ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

ThinPlateSplines_train_df <- ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
ThinPlateSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = ThinPlateSplines_train_df)
ThinPlateSplines_smooth_Best_AIC_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = ThinPlateSplines_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57779 -0.27556 -0.03859  0.20834  0.77819 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   15.3868     3.1347   4.909 2.79e-05 ***
## V2   -6.4735     0.5236 -12.363 1.61e-13 ***
## V3   -1.3574     0.9152  -1.483  0.14812    
## V4   -5.1303     0.3352 -15.304 5.35e-16 ***
## V5   -7.9014     0.5062 -15.609 3.11e-16 ***
## V6   -2.8075     0.4251  -6.604 2.23e-07 ***
## V7   -3.7503     0.5657  -6.630 2.07e-07 ***
## V8    1.5624     0.5277   2.961  0.00584 ** 
## V9   10.5557     1.6547   6.379 4.19e-07 ***
## V10 -13.8512     1.2411 -11.161 2.19e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3685 on 31 degrees of freedom
## Multiple R-squared:  0.9769, Adjusted R-squared:  0.9695 
## F-statistic: 131.3 on 10 and 31 DF,  p-value: < 2.2e-16
coefplot::coefplot(ThinPlateSplines_smooth_Best_AIC_fit) + theme_bw()

ThinPlateSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.977         0.970 0.369      131. 1.68e-22    10  -11.5  45.0  63.9
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

ThinPlateSplines_test_basis_df <- PredictMat(ThinPlateSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_test_pred <- predict(ThinPlateSplines_smooth_Best_AIC_fit, newdata = ThinPlateSplines_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = ThinPlateSplines_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

Overfitted ThinPlateSplines model

ThinPlateSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'ts'), data = train_df_1d)[[1]]
ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 30
##        V1    V2    V3    V4     V5    V6    V7    V8    V9   V10    V11    V12
##     <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 -1.61  0.775 1.78  1.04  -1.92  1.16  1.99  1.22  2.03  1.26  -2.06  -1.29 
##  2 -1.53  0.796 1.69  1.06  -1.82  1.18  1.89  1.24  1.92  1.28  -1.95  -1.30 
##  3 -1.45  0.818 1.60  1.08  -1.72  1.19  1.77  1.23  1.79  1.25  -1.79  -1.24 
##  4 -1.37  0.837 1.51  1.09  -1.60  1.16  1.62  1.17  1.60  1.14  -1.56  -1.09 
##  5 -1.30  0.854 1.41  1.07  -1.47  1.10  1.43  1.04  1.36  0.958 -1.29  -0.882
##  6 -1.22  0.866 1.31  1.04  -1.31  0.992 1.22  0.869 1.11  0.761 -1.04  -0.716
##  7 -1.14  0.872 1.20  0.978 -1.13  0.844 0.984 0.680 0.881 0.608 -0.881 -0.650
##  8 -1.07  0.870 1.08  0.888 -0.940 0.671 0.765 0.517 0.721 0.538 -0.810 -0.663
##  9 -0.999 0.859 0.959 0.771 -0.744 0.496 0.586 0.413 0.639 0.543 -0.784 -0.672
## 10 -0.931 0.837 0.831 0.632 -0.557 0.343 0.463 0.378 0.616 0.571 -0.735 -0.601
## # … with 31 more rows, and 18 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
## #   V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## #   V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## #   V28 <dbl>, V29 <dbl>, V30 <dbl>
ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

ThinPlateSplines__overfit_train_df <- ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
ThinPlateSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = ThinPlateSplines__overfit_train_df)
ThinPlateSplines_smooth_overfit_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = ThinPlateSplines__overfit_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39433 -0.16298  0.00417  0.14536  0.42438 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)    
## V1   19.25274   17.22939   1.117 0.287620    
## V2   -9.10312    2.03978  -4.463 0.000958 ***
## V3   -3.13689    4.22109  -0.743 0.472966    
## V4   -5.71051    0.47887 -11.925 1.24e-07 ***
## V5   -7.50099    1.00034  -7.498 1.20e-05 ***
## V6   -3.04956    0.44389  -6.870 2.69e-05 ***
## V7    3.57004    0.67840   5.262 0.000267 ***
## V8   -1.69355    0.52718  -3.212 0.008269 ** 
## V9   -0.26929    0.68976  -0.390 0.703689    
## V10  -0.24306    0.62498  -0.389 0.704769    
## V11   0.59326    0.76645   0.774 0.455226    
## V12  -1.15558    0.72297  -1.598 0.138265    
## V13  -1.25455    0.85634  -1.465 0.170905    
## V14   0.60009    0.81719   0.734 0.478103    
## V15  -1.10174    0.94608  -1.165 0.268846    
## V16   0.76193    0.90525   0.842 0.417887    
## V17   0.34213    1.03024   0.332 0.746069    
## V18   0.28867    0.98476   0.293 0.774871    
## V19   0.87147    1.10502   0.789 0.446980    
## V20   0.77925    1.05287   0.740 0.474726    
## V21   1.71454    1.16652   1.470 0.169630    
## V22  -1.81793    1.10588  -1.644 0.128450    
## V23   0.93086    1.21009   0.769 0.457947    
## V24  -1.97274    1.13907  -1.732 0.111202    
## V25  -1.05248    1.22995  -0.856 0.410408    
## V26  -0.01229    1.14646  -0.011 0.991639    
## V27   1.34061    1.21901   1.100 0.294925    
## V28  -0.82839    1.12086  -0.739 0.475340    
## V29   6.43951    9.01735   0.714 0.490027    
## V30 -23.17077    5.59364  -4.142 0.001638 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3638 on 11 degrees of freedom
## Multiple R-squared:  0.992,  Adjusted R-squared:  0.9703 
## F-statistic: 45.62 on 30 and 11 DF,  p-value: 5.639e-08
coefplot::coefplot(ThinPlateSplines_smooth_overfit_fit) + theme_bw()

ThinPlateSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.992         0.970 0.364      45.6 5.64e-8    30   10.3  41.5  94.6
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

ThinPlateSplines_overfit_test_basis_df <- PredictMat(ThinPlateSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_overfit_test_pred <- predict(ThinPlateSplines_smooth_overfit_fit, newdata = ThinPlateSplines_overfit_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = ThinPlateSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

#Using regularization

minimize_error_func <- function(beta, my_info){
  basis_mat = my_info$basis_mat
  
  response_vec = my_info$yobs
  
  lambda = my_info$lambda
  
  penalty_mat = my_info$penalty_mat
  
  linear_pred = as.vector(basis_mat %*% as.matrix(beta))
  
  sum_of_squares = sum((response_vec - linear_pred) ^ 2)
  
  penalty = t(as.matrix(beta)) %*% (penalty_mat %*% as.matrix(beta))
  
  objective = sum_of_squares + lambda * penalty

  objective  
}
ThinPlateSplines_fit_func_Reg <- function(df_1, data, lambda){
  Smooth <- smoothCon(s(x1, k=df_1, bs = 'ts'), data = data)[[1]]
  Smooth_basis <- Smooth$X
  Smooth_penalty <- Smooth$S
  
  info_list <- list(
    yobs = data$y, 
    basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
    penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
    lambda = lambda
  )
  
  start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
  
  fit <- optim(start_guess,
               minimize_error_func,
               gr = NULL,
               info_list,
               method = "BFGS",
               hessian = TRUE,
               control = list(fnscale = 1, maxit = 1001))
  
  return(list("fit" = fit, "Smooth" = Smooth))
}
ThinPlateSplines_reg <- ThinPlateSplines_fit_func_Reg(25, train_df_1d, 0.0001)
#ThinPlateSplines_reg$fit
ThinPlateSplines_reg_test_basis_df <- PredictMat(ThinPlateSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_reg_test_pred <- as.matrix(ThinPlateSplines_reg_test_basis_df)  %*% as.matrix(ThinPlateSplines_reg$fit$par)
# ThinPlateSplines_reg_test_pred
fine_df_1d %>%
  mutate(y_pred = ThinPlateSplines_reg_test_pred) %>%
  mutate(y_pred_unreg = ThinPlateSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
  
  geom_point(mapping = aes(y = y), color = "red") +
  
  theme_bw()

Penalized Cubic Regression splines

CubicSplines_fit_func_No_Reg <- function(df_1, data){
  model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'cc'), data = data)[[1]]$X - 1, data = data)
  metrics <- glance(model) %>% mutate(df_1 = df_1)
  return(metrics)
}
df_grid_cubic<-  list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_cubic
## # A tibble: 14 x 1
##       x1
##    <int>
##  1     5
##  2     6
##  3     7
##  4     8
##  5     9
##  6    10
##  7    11
##  8    12
##  9    13
## 10    14
## 11    15
## 12    16
## 13    17
## 14    18
CubicSplines_comp_fit <- map_dfr(df_grid_cubic$x1, CubicSplines_fit_func_No_Reg, data = train_df_1d)
CubicSplines_comp_fit
## # A tibble: 14 x 13
##    r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##        <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1     0.700         0.668 1.22       21.6 2.97e- 9     4  -64.1 138.  147. 
##  2     0.800         0.772 1.01       28.7 1.26e-11     5  -55.8 124.  134. 
##  3     0.882         0.862 0.785      43.5 8.39e-15     6  -45.0 104.  116. 
##  4     0.926         0.910 0.632      60.4 2.52e-17     7  -35.5  87.1 101. 
##  5     0.951         0.939 0.520      80.2 2.16e-19     8  -26.9  71.9  87.3
##  6     0.951         0.937 0.529      68.9 2.14e-18     9  -27.0  74.0  91.1
##  7     0.950         0.934 0.543      58.8 2.49e-17    10  -27.4  76.8  95.7
##  8     0.952         0.935 0.539      54.4 9.76e-17    11  -26.4  76.9  97.4
##  9     0.955         0.936 0.532      51.3 3.23e-16    12  -25.2  76.5  98.7
## 10     0.959         0.940 0.516      50.7 6.10e-16    13  -23.2  74.4  98.4
## 11     0.964         0.945 0.494      51.5 8.83e-16    14  -20.7  71.4  97.1
## 12     0.963         0.942 0.507      45.7 7.69e-15    15  -21.0  73.9 101. 
## 13     0.965         0.942 0.507      42.8 3.43e-14    16  -20.2  74.4 104. 
## 14     0.966         0.942 0.508      40.2 1.47e-13    17  -19.4  74.9 106. 
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## #   df_1 <int>
df_best_CubicSplines_AIC <- CubicSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_CubicSplines_AIC
## # A tibble: 1 x 1
##    df_1
##   <int>
## 1    15
CubicSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_CubicSplines_AIC$df_1, bs = 'cc'), data = train_df_1d)[[1]]
CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 14
##          V1      V2      V3      V4        V5       V6       V7        V8
##       <dbl>   <dbl>   <dbl>   <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1  1.00     0       0       0      -1.73e-18  0.       0.       5.42e-20
##  2  0.782    0.401  -0.0959  0.0257 -6.88e- 3  1.84e-3 -4.81e-4  8.21e- 5
##  3  0.334    0.835  -0.132   0.0353 -9.46e- 3  2.53e-3 -6.71e-4  1.51e- 4
##  4 -0.0368   0.995   0.0435 -0.0116  3.12e- 3 -8.35e-4  2.23e-4 -5.63e- 5
##  5 -0.136    0.725   0.468  -0.108   2.90e- 2 -7.77e-3  2.08e-3 -5.44e- 4
##  6 -0.0680   0.269   0.881  -0.123   3.30e- 2 -8.84e-3  2.37e-3 -6.28e- 4
##  7  0.0180  -0.0671  0.979   0.0931 -2.47e- 2  6.61e-3 -1.77e-3  4.73e- 4
##  8  0.0357  -0.133   0.664   0.535  -1.19e- 1  3.18e-2 -8.53e-3  2.28e- 3
##  9  0.0143  -0.0533  0.207   0.922  -1.10e- 1  2.94e-2 -7.88e-3  2.11e- 3
## 10 -0.00655  0.0244 -0.0912  0.955   1.48e- 1 -3.87e-2  1.04e-2 -2.78e- 3
## # … with 31 more rows, and 6 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## #   V12 <dbl>, V13 <dbl>, V14 <dbl>
CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

CubicSplines_train_df <- CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
CubicSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = CubicSplines_train_df)
CubicSplines_smooth_Best_AIC_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = CubicSplines_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.40398 -0.19270  0.03066  0.25443  1.17067 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   -0.6223     0.2688  -2.315 0.028439 *  
## V2   -1.0236     0.3195  -3.204 0.003464 ** 
## V3   -3.1432     0.3202  -9.816 2.11e-10 ***
## V4   -3.2375     0.3206 -10.097 1.15e-10 ***
## V5   -2.8455     0.3207  -8.872 1.73e-09 ***
## V6    0.7397     0.3206   2.307 0.028951 *  
## V7    2.2828     0.3204   7.124 1.16e-07 ***
## V8    3.2251     0.3203  10.068 1.23e-10 ***
## V9    1.8088     0.3204   5.645 5.42e-06 ***
## V10   0.1083     0.3206   0.338 0.738075    
## V11  -1.8078     0.3207  -5.636 5.55e-06 ***
## V12  -1.2063     0.3206  -3.762 0.000827 ***
## V13   2.1297     0.3202   6.651 3.88e-07 ***
## V14   1.9278     0.3195   6.034 1.93e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4939 on 27 degrees of freedom
## Multiple R-squared:  0.9639, Adjusted R-squared:  0.9452 
## F-statistic: 51.53 on 14 and 27 DF,  p-value: 8.825e-16
coefplot::coefplot(CubicSplines_smooth_Best_AIC_fit) + theme_bw()

CubicSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.964         0.945 0.494      51.5 8.83e-16    14  -20.7  71.4  97.1
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

CubicSplines_test_basis_df <- PredictMat(CubicSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_test_pred <- predict(CubicSplines_smooth_Best_AIC_fit, newdata = CubicSplines_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = CubicSplines_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

Overfitted cubicSplines model

CubicSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'cc'), data = train_df_1d)[[1]]
CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 29
##          V1        V2        V3        V4       V5        V6        V7       V8
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>    <dbl>
##  1  1.00e+0  5.55e-17 -2.78e-17  6.94e-18  0        4.34e-19 -1.08e-19  0.     
##  2  3.01e-1  8.59e- 1 -1.28e- 1  3.43e- 2 -0.00919  2.46e- 3 -6.60e- 4  1.77e-4
##  3 -1.33e-1  6.64e- 1  5.35e- 1 -1.19e- 1  0.0318  -8.53e- 3  2.29e- 3 -6.13e-4
##  4  2.71e-2 -1.01e- 1  9.39e- 1  1.77e- 1 -0.0460   1.23e- 2 -3.30e- 3  8.84e-4
##  5  6.61e-3 -2.47e- 2  9.31e- 2  9.79e- 1 -0.0671   1.80e- 2 -4.81e- 3  1.29e-3
##  6 -7.34e-3  2.74e- 2 -1.02e- 1  4.34e- 1  0.754   -1.37e- 1  3.66e- 2 -9.81e-3
##  7  2.62e-3 -9.77e- 3  3.64e- 2 -1.36e- 1  0.782    4.01e- 1 -9.59e- 2  2.57e-2
##  8 -2.72e-4  1.01e- 3 -3.78e- 3  1.41e- 2 -0.0527   9.88e- 1  6.76e- 2 -1.80e-2
##  9 -2.75e-4  1.03e- 3 -3.83e- 3  1.43e- 2 -0.0533   2.07e- 1  9.22e- 1 -1.10e-1
## 10  1.70e-4 -6.36e- 4  2.37e- 3 -8.86e- 3  0.0331  -1.23e- 1  5.68e- 1  6.33e-1
## # … with 31 more rows, and 21 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## #   V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>,
## #   V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## #   V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>
CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

CubicSplines_overfit_train_df <- CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
CubicSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = CubicSplines_overfit_train_df)
CubicSplines_smooth_overfit_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = CubicSplines_overfit_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28031 -0.16479  0.00262  0.14720  1.29435 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   -0.7459     0.4305  -1.733 0.108716    
## V2   -0.5421     0.5904  -0.918 0.376613    
## V3   -0.8982     0.5900  -1.522 0.153857    
## V4   -2.4369     0.5887  -4.140 0.001372 ** 
## V5   -2.5162     0.5937  -4.238 0.001152 ** 
## V6   -3.6199     0.5884  -6.152 4.93e-05 ***
## V7   -3.2820     0.5918  -5.545 0.000127 ***
## V8   -3.1460     0.5929  -5.306 0.000186 ***
## V9   -3.1827     0.5875  -5.417 0.000156 ***
## V10  -1.8287     0.5936  -3.081 0.009520 ** 
## V11   0.3431     0.5905   0.581 0.571998    
## V12   1.1930     0.5898   2.023 0.065974 .  
## V13   2.3816     0.5937   4.011 0.001727 ** 
## V14   2.3285     0.5879   3.961 0.001890 ** 
## V15   3.4232     0.5924   5.778 8.76e-05 ***
## V16   2.8749     0.5924   4.853 0.000396 ***
## V17   2.4742     0.5879   4.209 0.001213 ** 
## V18   1.3457     0.5937   2.266 0.042715 *  
## V19   0.7628     0.5898   1.293 0.220262    
## V20  -0.4784     0.5905  -0.810 0.433659    
## V21  -0.8730     0.5936  -1.471 0.167087    
## V22  -2.2080     0.5875  -3.758 0.002729 ** 
## V23  -1.9631     0.5929  -3.311 0.006213 ** 
## V24  -0.5647     0.5918  -0.954 0.358875    
## V25   0.4873     0.5884   0.828 0.423691    
## V26   2.3337     0.5937   3.931 0.001997 ** 
## V27   2.7928     0.5887   4.744 0.000477 ***
## V28   1.6950     0.5900   2.873 0.014017 *  
## V29   0.5219     0.5904   0.884 0.394070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6327 on 12 degrees of freedom
## Multiple R-squared:  0.9737, Adjusted R-squared:  0.9101 
## F-statistic: 15.31 on 29 and 12 DF,  p-value: 7.934e-06
coefplot::coefplot(CubicSplines_smooth_overfit_fit) + theme_bw()

CubicSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.974         0.910 0.633      15.3 7.93e-6    29  -14.2  88.4  140.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

CubicSplines_overfit_test_basis_df <- PredictMat(CubicSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_overfit_test_pred <- predict(CubicSplines_smooth_overfit_fit, newdata = CubicSplines_overfit_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = CubicSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

#Using regularization

CubicSplines_fit_func_Reg <- function(df_1, data, lambda){
  Smooth <- smoothCon(s(x1, k=df_1, bs = 'cc'), data = data)[[1]]
  Smooth_basis <- Smooth$X
  Smooth_penalty <- Smooth$S
  
  info_list <- list(
    yobs = data$y, 
    basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
    penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
    lambda = lambda
  )
  
  start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
  
  fit <- optim(start_guess,
               minimize_error_func,
               gr = NULL,
               info_list,
               method = "BFGS",
               hessian = TRUE,
               control = list(fnscale = 1, maxit = 1001))
  
  return(list("fit" = fit, "Smooth" = Smooth))
}
CubicSplines_reg <- CubicSplines_fit_func_Reg(30, train_df_1d, 2.0)
CubicSplines_reg_test_basis_df <- PredictMat(CubicSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_reg_test_pred <- as.matrix(CubicSplines_reg_test_basis_df)  %*% as.matrix(CubicSplines_reg$fit$par)
fine_df_1d %>%
  mutate(y_pred = CubicSplines_reg_test_pred) %>%
  mutate(y_pred_unreg = CubicSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
  
  geom_point(mapping = aes(y = y), color = "red") +
  
  theme_bw()

Tensor Smooth splines

TensorSplines_fit_func_No_Reg <- function(df_1, data){
  model <- lm(y ~ smoothCon(ti(x1, k=df_1), data = data)[[1]]$X - 1, data = data)
  metrics <- glance(model) %>% mutate(df_1 = df_1)
  return(metrics)
}
df_grid_tensor<-  list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_tensor
## # A tibble: 14 x 1
##       x1
##    <int>
##  1     5
##  2     6
##  3     7
##  4     8
##  5     9
##  6    10
##  7    11
##  8    12
##  9    13
## 10    14
## 11    15
## 12    16
## 13    17
## 14    18
TensorSplines_comp_fit <- map_dfr(df_grid_tensor$x1, TensorSplines_fit_func_No_Reg, data = train_df_1d)
TensorSplines_comp_fit
## # A tibble: 14 x 13
##    r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##        <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1     0.648         0.610 1.32       17.1 5.20e- 8     4 -67.4  145.  153. 
##  2     0.712         0.672 1.21       17.8 7.43e- 9     5 -63.3  139.  149. 
##  3     0.873         0.851 0.814      40.1 2.87e-14     6 -46.5  107.  119. 
##  4     0.939         0.926 0.574      74.2 1.00e-18     7 -31.6   79.2  92.9
##  5     0.970         0.963 0.405     135.  6.08e-23     8 -16.7   51.4  66.8
##  6     0.972         0.964 0.399     124.  2.82e-22     9 -15.5   50.9  68.1
##  7     0.972         0.963 0.407     107.  3.63e-21    10 -15.6   53.2  72.1
##  8     0.973         0.964 0.402      99.8 1.67e-20    11 -14.5   52.9  73.5
##  9     0.974         0.963 0.405      90.2 1.30e-19    12 -14.0   54.1  76.4
## 10     0.975         0.964 0.401      85.4 5.67e-19    13 -12.9   53.7  77.7
## 11     0.978         0.967 0.383      87.0 9.89e-19    14 -10.3   50.5  76.2
## 12     0.977         0.964 0.400      74.3 1.81e-17    15 -11.3   54.6  82.0
## 13     0.978         0.964 0.398      70.6 8.51e-17    16 -10.2   54.5  83.6
## 14     0.980         0.965 0.395      67.5 3.78e-16    17  -9.08  54.2  85.0
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## #   df_1 <int>
df_best_TensorSplines_AIC <- TensorSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_TensorSplines_AIC
## # A tibble: 1 x 1
##    df_1
##   <int>
## 1    15
TensorSplines_smooth_Best_AIC = smoothCon(ti(x1, k=df_best_TensorSplines_AIC$df_1), data = train_df_1d)[[1]]
TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 14
##         V1      V2      V3      V4      V5      V6      V7      V8      V9
##      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 -0.298  -0.258  -0.269  -0.266  -0.267  -0.266  -0.267  -0.266  -0.267 
##  2  0.333  -0.309  -0.148  -0.191  -0.180  -0.183  -0.182  -0.182  -0.182 
##  3  0.794  -0.261  -0.0701 -0.121  -0.107  -0.111  -0.110  -0.110  -0.110 
##  4  0.913  -0.0160 -0.0770 -0.0607 -0.0651 -0.0639 -0.0642 -0.0641 -0.0642
##  5  0.633   0.430  -0.161  -0.0195 -0.0573 -0.0472 -0.0499 -0.0492 -0.0494
##  6  0.191   0.835  -0.178  -0.0200 -0.0624 -0.0511 -0.0541 -0.0533 -0.0535
##  7 -0.131   0.919   0.0316 -0.0860 -0.0547 -0.0631 -0.0609 -0.0614 -0.0613
##  8 -0.195   0.600   0.471  -0.183  -0.0320 -0.0724 -0.0616 -0.0644 -0.0637
##  9 -0.120   0.145   0.859  -0.173  -0.0335 -0.0708 -0.0608 -0.0635 -0.0628
## 10 -0.0461 -0.150   0.892   0.0865 -0.100  -0.0511 -0.0643 -0.0608 -0.0618
## # … with 31 more rows, and 5 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>
TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

TensorSplines_train_df <- TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
TensorSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = TensorSplines_train_df)
TensorSplines_smooth_Best_AIC_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = TensorSplines_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57797 -0.32042 -0.15595  0.09521  0.44434 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   -1.2384     0.2717  -4.557 0.000100 ***
## V2   -2.9956     0.2451 -12.220 1.64e-12 ***
## V3   -3.2860     0.2605 -12.613 7.87e-13 ***
## V4   -2.7893     0.2538 -10.991 1.80e-11 ***
## V5    0.7408     0.2575   2.876 0.007762 ** 
## V6    2.3118     0.2554   9.050 1.15e-09 ***
## V7    3.2420     0.2564  12.642 7.46e-13 ***
## V8    1.8272     0.2559   7.139 1.12e-07 ***
## V9    0.1353     0.2564   0.528 0.601943    
## V10  -1.8030     0.2561  -7.040 1.44e-07 ***
## V11  -1.1571     0.2561  -4.519 0.000111 ***
## V12   2.0935     0.2535   8.259 7.24e-09 ***
## V13   2.0753     0.2538   8.177 8.80e-09 ***
## V14  -1.8283     0.3359  -5.444 9.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3829 on 27 degrees of freedom
## Multiple R-squared:  0.9783, Adjusted R-squared:  0.9671 
## F-statistic: 87.04 on 14 and 27 DF,  p-value: < 2.2e-16
coefplot::coefplot(TensorSplines_smooth_Best_AIC_fit) + theme_bw()

TensorSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.978         0.967 0.383      87.0 9.89e-19    14  -10.3  50.5  76.2
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

TensorSplines_test_basis_df <- PredictMat(TensorSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_test_pred <- predict(TensorSplines_smooth_Best_AIC_fit, newdata = TensorSplines_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = TensorSplines_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

Overfitted TensorSplines model

TensorSplines_smooth_overfit = smoothCon(ti(x1, k=30), data = train_df_1d)[[1]]
TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 29
##          V1       V2      V3       V4      V5       V6       V7       V8
##       <dbl>    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
##  1 -0.195   -0.181   -0.186  -0.182   -0.186  -0.184   -0.183   -1.86e-1
##  2  0.871   -0.207   -0.0205 -0.0695  -0.0571 -0.0602  -0.0590  -6.01e-2
##  3  0.609    0.526   -0.140   0.0144  -0.0272 -0.0160  -0.0189  -1.84e-2
##  4 -0.127    0.906    0.146  -0.0770  -0.0192 -0.0345  -0.0302  -3.18e-2
##  5 -0.0551   0.0629   0.949  -0.0971  -0.0126 -0.0351  -0.0288  -3.10e-2
##  6 -0.00510 -0.130    0.405   0.726   -0.166   0.00791 -0.0384  -2.64e-2
##  7 -0.0403   0.00733 -0.166   0.753    0.371  -0.125   -0.00360 -3.66e-2
##  8 -0.0306  -0.0329  -0.0159 -0.0821   0.958   0.0380  -0.0475  -2.52e-2
##  9 -0.0303  -0.0328  -0.0155 -0.0825   0.177   0.892   -0.139   -3.61e-4
## 10 -0.0316  -0.0264  -0.0384  0.00415 -0.153   0.539    0.604   -1.60e-1
## # … with 31 more rows, and 21 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## #   V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>,
## #   V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## #   V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>
TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

TensorSplines_overfit_train_df <- TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
TensorSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = TensorSplines_overfit_train_df)
TensorSplines_smooth_overfit_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = TensorSplines_overfit_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52981 -0.26919 -0.12846 -0.01008  0.22824 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   -0.6999     0.3805  -1.839 0.090740 .  
## V2   -0.7736     0.3786  -2.043 0.063622 .  
## V3   -2.4719     0.3873  -6.382 3.49e-05 ***
## V4   -2.4624     0.3876  -6.353 3.65e-05 ***
## V5   -3.6148     0.3865  -9.353 7.34e-07 ***
## V6   -3.2511     0.3877  -8.386 2.31e-06 ***
## V7   -3.1281     0.3888  -8.045 3.55e-06 ***
## V8   -3.1593     0.3853  -8.200 2.92e-06 ***
## V9   -1.8072     0.3891  -4.644 0.000566 ***
## V10   0.3655     0.3872   0.944 0.363856    
## V11   1.2140     0.3868   3.139 0.008553 ** 
## V12   2.4048     0.3892   6.179 4.73e-05 ***
## V13   2.3493     0.3856   6.093 5.40e-05 ***
## V14   3.4457     0.3884   8.872 1.28e-06 ***
## V15   2.8972     0.3884   7.460 7.64e-06 ***
## V16   2.4953     0.3856   6.471 3.06e-05 ***
## V17   1.3685     0.3892   3.516 0.004252 ** 
## V18   0.7844     0.3868   2.028 0.065355 .  
## V19  -0.4571     0.3872  -1.180 0.260737    
## V20  -0.8494     0.3891  -2.183 0.049611 *  
## V21  -2.1884     0.3854  -5.679 0.000103 ***
## V22  -1.9381     0.3887  -4.987 0.000316 ***
## V23  -0.5470     0.3880  -1.410 0.184038    
## V24   0.5165     0.3858   1.339 0.205501    
## V25   2.3413     0.3888   6.021 6.02e-05 ***
## V26   2.8418     0.3850   7.380 8.50e-06 ***
## V27   1.6678     0.3826   4.359 0.000930 ***
## V28   0.6291     0.3729   1.687 0.117384    
## V29  -2.0131     0.4079  -4.936 0.000345 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.413 on 12 degrees of freedom
## Multiple R-squared:  0.9888, Adjusted R-squared:  0.9617 
## F-statistic:  36.5 on 29 and 12 DF,  p-value: 5.667e-08
coefplot::coefplot(TensorSplines_smooth_overfit_fit) + theme_bw()

TensorSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.989         0.962 0.413      36.5 5.67e-8    29   3.27  53.5  105.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

TensorSplines_overfit_test_basis_df <- PredictMat(TensorSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_overfit_test_pred <- predict(TensorSplines_smooth_overfit_fit, newdata = TensorSplines_overfit_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = TensorSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

#Using regularization

TensorSplines_fit_func_Reg <- function(df_1, data, lambda){
  Smooth <- smoothCon(ti(x1, k=df_1), data = data)[[1]]
  Smooth_basis <- Smooth$X
  Smooth_penalty <- Smooth$S
  
  info_list <- list(
    yobs = data$y, 
    basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
    penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
    lambda = lambda
  )
  
  start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
  
  fit <- optim(start_guess,
               minimize_error_func,
               gr = NULL,
               info_list,
               method = "BFGS",
               hessian = TRUE,
               control = list(fnscale = 1, maxit = 1001))
  
  return(list("fit" = fit, "Smooth" = Smooth))
}
TensorSplines_reg <- TensorSplines_fit_func_Reg(30, train_df_1d, 1.0)
TensorSplines_reg_test_basis_df <- PredictMat(TensorSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_reg_test_pred <- as.matrix(TensorSplines_reg_test_basis_df)  %*% as.matrix(TensorSplines_reg$fit$par)
fine_df_1d %>%
  mutate(y_pred = TensorSplines_reg_test_pred) %>%
  mutate(y_pred_unreg = TensorSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
  
  geom_point(mapping = aes(y = y), color = "red") +
  
  theme_bw()

P-Splines Smooths

PSplines_fit_func_No_Reg <- function(df_1, data){
  model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'ps'), data = data)[[1]]$X - 1, data = data)
  metrics <- glance(model) %>% mutate(df_1 = df_1)
  return(metrics)
}
df_grid_psplines<-  list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_psplines
## # A tibble: 14 x 1
##       x1
##    <int>
##  1     5
##  2     6
##  3     7
##  4     8
##  5     9
##  6    10
##  7    11
##  8    12
##  9    13
## 10    14
## 11    15
## 12    16
## 13    17
## 14    18
PSplines_comp_fit <- map_dfr(df_grid_psplines$x1, PSplines_fit_func_No_Reg, data = train_df_1d)
PSplines_comp_fit
## # A tibble: 14 x 13
##    r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##        <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1     0.522         0.455 1.56       7.85 4.43e- 5     5 -73.7  159.  170. 
##  2     0.593         0.524 1.46       8.51 9.95e- 6     6 -70.4  155.  167. 
##  3     0.966         0.959 0.426    139.   3.96e-23     7 -19.3   54.6  68.3
##  4     0.948         0.936 0.534     75.8  5.25e-19     8 -28.0   74.1  89.5
##  5     0.952         0.939 0.523     70.7  1.43e-18     9 -26.5   73.0  90.1
##  6     0.964         0.953 0.457     84.1  1.30e-19    10 -20.4   62.8  81.6
##  7     0.977         0.968 0.377    114.   2.45e-21    11 -11.8   47.6  68.2
##  8     0.976         0.966 0.389     97.9  4.14e-20    12 -12.4   50.8  73.1
##  9     0.976         0.965 0.397     86.8  4.52e-19    13 -12.5   53.0  77.0
## 10     0.977         0.966 0.392     83.0  1.84e-18    14 -11.2   52.4  78.1
## 11     0.978         0.965 0.395     76.2  1.31e-17    15 -10.8   53.5  81.0
## 12     0.980         0.967 0.385     75.5  3.77e-17    16  -8.88  51.8  80.9
## 13     0.983         0.971 0.357     82.9  3.46e-17    17  -4.95  45.9  76.7
## 14     0.982         0.969 0.373     71.5  5.61e-16    18  -5.94  49.9  82.4
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## #   df_1 <int>
df_best_PSplines_AIC <- PSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_PSplines_AIC
## # A tibble: 1 x 1
##    df_1
##   <int>
## 1    17
PSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_PSplines_AIC$df_1, bs = 'ps'), data = train_df_1d)[[1]]
PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 17
##         V1      V2      V3      V4      V5      V6      V7    V8    V9   V10
##      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl>
##  1 0.160   0.666   0.174   4.55e-7 0.      0.      0.          0     0     0
##  2 0.0430  0.559   0.390   7.99e-3 0.      0.      0.          0     0     0
##  3 0.00396 0.340   0.596   6.03e-2 0.      0.      0.          0     0     0
##  4 0       0.138   0.663   1.99e-1 3.95e-5 0.      0.          0     0     0
##  5 0       0.0340  0.532   4.22e-1 1.16e-2 0.      0.          0     0     0
##  6 0       0.00229 0.308   6.16e-1 7.33e-2 0.      0.          0     0     0
##  7 0       0       0.118   6.55e-1 2.27e-1 2.21e-4 0.          0     0     0
##  8 0       0       0.0264  5.04e-1 4.53e-1 1.61e-2 0.          0     0     0
##  9 0       0       0.00117 2.77e-1 6.33e-1 8.80e-2 0.          0     0     0
## 10 0       0       0       9.96e-2 6.44e-1 2.56e-1 6.53e-4     0     0     0
## # … with 31 more rows, and 7 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>,
## #   V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>
PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

PSplines_train_df <- PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
PSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = PSplines_train_df)
PSplines_smooth_Best_AIC_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = PSplines_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56602 -0.16323  0.02515  0.21155  0.55547 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1    7.2483     5.3878   1.345 0.191101    
## V2   -0.8174     1.2022  -0.680 0.503044    
## V3   -0.5579     0.6497  -0.859 0.399076    
## V4   -3.7147     0.5208  -7.132 2.26e-07 ***
## V5   -3.0047     0.4850  -6.195 2.11e-06 ***
## V6   -3.8905     0.4746  -8.198 2.04e-08 ***
## V7    1.5287     0.4712   3.244 0.003453 ** 
## V8    2.0734     0.4700   4.411 0.000186 ***
## V9    3.9005     0.4696   8.305 1.62e-08 ***
## V10   1.6545     0.4700   3.520 0.001752 ** 
## V11   0.3186     0.4712   0.676 0.505457    
## V12  -2.3176     0.4746  -4.884 5.58e-05 ***
## V13  -1.9595     0.4850  -4.040 0.000476 ***
## V14   3.0821     0.5208   5.918 4.17e-06 ***
## V15   2.4674     0.6497   3.797 0.000878 ***
## V16  -1.1288     1.2022  -0.939 0.357113    
## V17 -10.5723     5.3878  -1.962 0.061428 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3569 on 24 degrees of freedom
## Multiple R-squared:  0.9833, Adjusted R-squared:  0.9714 
## F-statistic: 82.93 on 17 and 24 DF,  p-value: < 2.2e-16
coefplot::coefplot(PSplines_smooth_Best_AIC_fit) + theme_bw()

PSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.983         0.971 0.357      82.9 3.46e-17    17  -4.95  45.9  76.7
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

PSplines_test_basis_df <- PredictMat(PSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_test_pred <- predict(PSplines_smooth_Best_AIC_fit, newdata = PSplines_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = PSplines_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

Overfitted PSplines model

PSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'ps'), data = train_df_1d)[[1]]
PSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 30
##         V1     V2      V3      V4      V5      V6     V7      V8     V9     V10
##      <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 0.154   0.666  0.180   3.26e-6 0       0.      0      0.      0      0.     
##  2 0.00447 0.348  0.590   5.73e-2 0       0.      0      0.      0      0.     
##  3 0       0.0408 0.553   3.98e-1 0.00874 0.      0      0.      0      0.     
##  4 0       0      0.144   6.64e-1 0.192   1.83e-5 0      0.      0      0.     
##  5 0       0      0.00360 3.34e-1 0.600   6.26e-2 0      0.      0      0.     
##  6 0       0      0       3.69e-2 0.541   4.12e-1 0.0103 0.      0      0.     
##  7 0       0      0       0.      0.135   6.62e-1 0.203  5.44e-5 0      0.     
##  8 0       0      0       0.      0.00285 3.20e-1 0.609  6.82e-2 0      0.     
##  9 0       0      0       0.      0       3.32e-2 0.530  4.25e-1 0.0120 0.     
## 10 0       0      0       0.      0       0.      0.126  6.59e-1 0.215  1.21e-4
## # … with 31 more rows, and 20 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>,
## #   V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>,
## #   V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>,
## #   V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>
PSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
  mutate(x1 = train_df_1d$x1) %>% 
  tibble::rowid_to_column() %>% 
  tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
  ggplot(mapping = aes(x = x1, y = value)) +
  geom_line(mapping = aes(color = key),
            size = 1.15) +
  theme_bw() +
  theme(legend.position = "top")

PSplines_overfit_train_df <- PSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
PSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = PSplines_overfit_train_df)
PSplines_smooth_overfit_fit %>% summary()
## 
## Call:
## lm(formula = y ~ . - 1, data = PSplines_overfit_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38928 -0.15212 -0.00088  0.14416  0.36534 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## V1   6.13570    9.18901   0.668 0.518075    
## V2  -0.42201    2.29614  -0.184 0.857522    
## V3  -0.62361    1.12009  -0.557 0.588849    
## V4  -0.67375    0.79853  -0.844 0.416787    
## V5  -3.09608    0.70974  -4.362 0.001132 ** 
## V6  -2.15893    0.67808  -3.184 0.008701 ** 
## V7  -4.56984    0.67359  -6.784 3.02e-05 ***
## V8  -2.34446    0.66831  -3.508 0.004900 ** 
## V9  -3.78469    0.67053  -5.644 0.000150 ***
## V10 -2.86549    0.66807  -4.289 0.001279 ** 
## V11  0.02942    0.67016   0.044 0.965770    
## V12  0.81295    0.66859   1.216 0.249468    
## V13  2.66214    0.66990   3.974 0.002181 ** 
## V14  1.83883    0.66912   2.748 0.018953 *  
## V15  3.90480    0.66956   5.832 0.000114 ***
## V16  2.68855    0.66956   4.015 0.002032 ** 
## V17  2.63589    0.66912   3.939 0.002315 ** 
## V18  0.99455    0.66990   1.485 0.165728    
## V19  0.61642    0.66859   0.922 0.376327    
## V20 -0.78277    0.67016  -1.168 0.267483    
## V21 -1.20588    0.66807  -1.805 0.098487 .  
## V22 -2.92490    0.67053  -4.362 0.001132 ** 
## V23 -1.14061    0.66831  -1.707 0.115912    
## V24  0.20689    0.67359   0.307 0.764472    
## V25  1.44574    0.67808   2.132 0.056374 .  
## V26  4.00722    0.70974   5.646 0.000150 ***
## V27  0.99627    0.79853   1.248 0.238073    
## V28  1.86743    1.12009   1.667 0.123659    
## V29 -3.49694    2.29614  -1.523 0.155984    
## V30 -0.22387    9.18901  -0.024 0.981000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 11 degrees of freedom
## Multiple R-squared:  0.9919, Adjusted R-squared:   0.97 
## F-statistic: 45.15 on 30 and 11 DF,  p-value: 5.957e-08
coefplot::coefplot(PSplines_smooth_overfit_fit) + theme_bw()

PSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.992         0.970 0.366      45.2 5.96e-8    30   10.0  41.9  95.0
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Prediction

PSplines_overfit_test_basis_df <- PredictMat(PSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_overfit_test_pred <- predict(PSplines_smooth_overfit_fit, newdata = PSplines_overfit_test_basis_df)
fine_df_1d %>%
  mutate(y_pred = PSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_point(mapping = aes(y = y), color = "red") +
  theme_bw()

#Using regularization

PSplines_fit_func_Reg <- function(df_1, data, lambda){
  Smooth <- smoothCon(s(x1, k=df_1, bs = 'ps'), data = data)[[1]]
  Smooth_basis <- Smooth$X
  Smooth_penalty <- Smooth$S
  
  info_list <- list(
    yobs = data$y, 
    basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
    penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
    lambda = lambda
  )
  
  start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
  
  fit <- optim(start_guess,
               minimize_error_func,
               gr = NULL,
               info_list,
               method = "BFGS",
               hessian = TRUE,
               control = list(fnscale = 1, maxit = 1001))
  
  return(list("fit" = fit, "Smooth" = Smooth))
}
PSplines_reg <- PSplines_fit_func_Reg(30, train_df_1d, 1.5)
PSplines_reg_test_basis_df <- PredictMat(PSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_reg_test_pred <- as.matrix(PSplines_reg_test_basis_df)  %*% as.matrix(PSplines_reg$fit$par)
fine_df_1d %>%
  mutate(y_pred = PSplines_reg_test_pred) %>%
  mutate(y_pred_unreg = PSplines_overfit_test_pred) %>%
  ggplot(mapping = aes(x = x1, y = y_pred)) +
  geom_line(size = 1.15) +
  geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
  
  geom_point(mapping = aes(y = y), color = "red") +
  
  theme_bw()